library(tidyverse)
library(openintro)
library(infer)
library(psych)
library(gghighlight)yrbss %>%
summary## age gender grade hispanic
## Min. :12.00 Length:13583 Length:13583 Length:13583
## 1st Qu.:15.00 Class :character Class :character Class :character
## Median :16.00 Mode :character Mode :character Mode :character
## Mean :16.16
## 3rd Qu.:17.00
## Max. :18.00
## NA's :77
## race height weight helmet_12m
## Length:13583 Min. :1.270 Min. : 29.94 Length:13583
## Class :character 1st Qu.:1.600 1st Qu.: 56.25 Class :character
## Mode :character Median :1.680 Median : 64.41 Mode :character
## Mean :1.691 Mean : 67.91
## 3rd Qu.:1.780 3rd Qu.: 76.20
## Max. :2.110 Max. :180.99
## NA's :1004 NA's :1004
## text_while_driving_30d physically_active_7d hours_tv_per_school_day
## Length:13583 Min. :0.000 Length:13583
## Class :character 1st Qu.:2.000 Class :character
## Mode :character Median :4.000 Mode :character
## Mean :3.903
## 3rd Qu.:7.000
## Max. :7.000
## NA's :273
## strength_training_7d school_night_hours_sleep
## Min. :0.00 Length:13583
## 1st Qu.:0.00 Class :character
## Median :3.00 Mode :character
## Mean :2.95
## 3rd Qu.:5.00
## Max. :7.00
## NA's :1176
yrbss %>%
count(text_while_driving_30d)## # A tibble: 9 × 2
## text_while_driving_30d n
## <chr> <int>
## 1 0 4792
## 2 1-2 925
## 3 10-19 373
## 4 20-29 298
## 5 3-5 493
## 6 30 827
## 7 6-9 311
## 8 did not drive 4646
## 9 <NA> 918
getProportions <- function(data) {
data <- data %>%
select(text_ind) %>%
count(text_ind) %>%
mutate(p = n / sum(n)) %>%
select(text_ind, n, p)
print(data)
return(data)
}neverType = "never"
type30 = "30"
yesLabel = "yes"
noLabel = "no"
nLabel = "n"
no_helmet <- yrbss %>%
filter(helmet_12m == neverType) %>%
mutate(text_ind = ifelse(!is.na(text_while_driving_30d)
& text_while_driving_30d == type30, yesLabel, noLabel))
no_helmetProportions <- no_helmet %>%
getProportions## # A tibble: 2 × 3
## text_ind n p
## <chr> <int> <dbl>
## 1 no 6514 0.934
## 2 yes 463 0.0664
#overall <- yrbss %>%
# mutate(text_ind = ifelse(!is.na(text_while_driving_30d)
# & text_while_driving_30d == type30
# & !is.na(helmet_12m)
# & helmet_12m == neverType, yesLabel, noLabel)) %>%
# getProportionsanalyzeConfidenceInterval <- function(sample, reps = 1000, yes = TRUE, level = 0.95, print = TRUE) {
interval <- sample %>%
specify(response = text_ind, success = ifelse (yes, yesLabel, noLabel)) %>%
generate(reps = reps, type = "bootstrap") %>%
calculate(stat = "prop") %>%
get_ci(level = level)
if (print) {
print(interval)
print(c(interval$upper_ci, interval$lower_ci) %>%
describe)
}
return(interval)
}analyzeSamplingProportionDistribution <- function(sample, size = NULL, reps = 15000, binwidth = .01, yes = TRUE) {
size <- ifelse(is.null(size), nrow(sample), size)
sizedSamples <- sample %>%
rep_sample_n(size = size, reps = reps, replace = TRUE) %>%
count(text_ind) %>%
mutate(p_hat = n /sum(n))
typeLabel <- ifelse(yes, yesLabel, notLabel)
filteredSamples <- sizedSamples %>%
filter(text_ind == typeLabel)
print(ggplot(data = filteredSamples, aes(x = p_hat)) +
geom_histogram(binwidth = binwidth) +
labs(
x = paste("p_hat (", typeLabel, ")", sep = ""),
title = "Sampling distribution of p_hat",
subtitle = paste("Sample size = ", size, "; Number of samples = ", reps, "; Bin width = ", binwidth, sep = "")
))
print(filteredSamples)
print(filteredSamples$p_hat %>%
describe)
print(sizedSamples$n %>%
sum)
return(sizedSamples)
}analyzeMarginOfError <- function(n, p, decimals = 3) {
me <- 2 * sqrt(p * (1 - p) / n)
print(paste("Margin of Error: ", me, sep = ""))
binwidth = 1/ 10 ^ decimals
pList <- seq(from = 0, to = 1, by = binwidth)
meList <- 2 * sqrt(pList * (1 - pList) / n)
dd <- data.frame(proportion = pList, marginOfError = meList)
print(ggplot(data = dd, aes(x = proportion, y = marginOfError)) +
geom_point() +
gghighlight(proportion == round(p, digits = decimals), label_key = marginOfError) +
labs(x = "Population Proportion",
y = "Margin of Error",
title = paste("Sample size = ", n, "; Proportion = ", p, "; Bin width = ", binwidth, sep = "")))
return(me)
}
analyzeMarginOfErrorData <- function(data, row = "yes", size = NULL, reps = 15000, binwidth = .01, yes = TRUE, distribute = TRUE) {
filteredData <- data %>%
getProportions %>%
filter(text_ind == row)
me <- analyzeMarginOfError(filteredData[[2]], filteredData[[3]])
if(distribute) {
proportionDistribution <- data %>%
analyzeSamplingProportionDistribution(size, reps, binwidth, yes)
}
return(me)
}me <- no_helmet %>%
analyzeMarginOfErrorData(distribute = FALSE)## # A tibble: 2 × 3
## text_ind n p
## <chr> <int> <dbl>
## 1 no 6514 0.934
## 2 yes 463 0.0664
## [1] "Margin of Error: 0.0231358334802707"
interval95 <- no_helmet %>%
analyzeConfidenceInterval## # A tibble: 1 × 2
## lower_ci upper_ci
## <dbl> <dbl>
## 1 0.0608 0.0727
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 2 0.07 0.01 0.07 0.07 0.01 0.06 0.07 0.01 0 -2.75 0.01
doNotWatchType = "do not watch"
type7 = "7"
noTVWorkoutEveryDay <- yrbss %>%
filter(hours_tv_per_school_day == doNotWatchType) %>%
mutate(text_ind = ifelse(!is.na(strength_training_7d)
& strength_training_7d == type7, yesLabel, noLabel))
me <- noTVWorkoutEveryDay %>%
analyzeMarginOfErrorData(distribute = FALSE)## # A tibble: 2 × 3
## text_ind n p
## <chr> <int> <dbl>
## 1 no 1578 0.858
## 2 yes 262 0.142
## [1] "Margin of Error: 0.0431783131870461"
interval95 <- noTVWorkoutEveryDay %>%
analyzeConfidenceInterval## # A tibble: 1 × 2
## lower_ci upper_ci
## <dbl> <dbl>
## 1 0.127 0.158
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 2 0.14 0.02 0.14 0.14 0.02 0.13 0.16 0.03 0 -2.75 0.02
the margin of error increases starting from 0 at 0% proportion until it reaches 50% where it achieves its max before decreasing back down to 0 at 100%
The resulting distribution is unimodal and without a skew centered around the given proportion
numOfElements = 300
proportion = .1
data = tibble(
text_ind = c(rep(yesLabel, numOfElements * proportion), rep(noLabel, numOfElements * (1 - proportion)))
)
me <- data %>%
analyzeMarginOfErrorData## # A tibble: 2 × 3
## text_ind n p
## <chr> <int> <dbl>
## 1 no 270 0.9
## 2 yes 30 0.1
## [1] "Margin of Error: 0.109544511501033"
## # A tibble: 15,000 × 4
## # Groups: replicate [15,000]
## replicate text_ind n p_hat
## <int> <chr> <int> <dbl>
## 1 1 yes 30 0.1
## 2 2 yes 35 0.117
## 3 3 yes 23 0.0767
## 4 4 yes 26 0.0867
## 5 5 yes 28 0.0933
## 6 6 yes 32 0.107
## 7 7 yes 31 0.103
## 8 8 yes 21 0.07
## 9 9 yes 33 0.11
## 10 10 yes 31 0.103
## # … with 14,990 more rows
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 15000 0.1 0.02 0.1 0.1 0.02 0.04 0.17 0.13 0.14 0 0
## [1] 4500000
The shape and center remains fairly consistent as unimodal and without a skew centered around the given proportion, but the margin of error increases starting from 0 at 0% proportion until it reaches 50% where it achieves its max before decreasing back down to 0 at 100%
minisculeProportion = .01
tinyProportion = .2
smallProportion = .40
mediumProportion = .60
largeProportion = .80
massiveProportion = .99
smallBinwidth = .001
minisculeData = tibble(
text_ind = c(rep(yesLabel, numOfElements * minisculeProportion), rep(noLabel, numOfElements * (1 - minisculeProportion)))
)
tinyData = tibble(
text_ind = c(rep(yesLabel, numOfElements * tinyProportion), rep(noLabel, numOfElements * (1 - tinyProportion)))
)
smallData = tibble(
text_ind = c(rep(yesLabel, numOfElements * smallProportion), rep(noLabel, numOfElements * (1 - smallProportion)))
)
mediumData = tibble(
text_ind = c(rep(yesLabel, numOfElements * mediumProportion), rep(noLabel, numOfElements * (1 - mediumProportion)))
)
largeData = tibble(
text_ind = c(rep(yesLabel, numOfElements * largeProportion), rep(noLabel, numOfElements * (1 - largeProportion)))
)
massiveData = tibble(
text_ind = c(rep(yesLabel, numOfElements * massiveProportion), rep(noLabel, numOfElements * (1 - massiveProportion)))
)
me <- minisculeData %>%
analyzeMarginOfErrorData(binwidth = smallBinwidth)## # A tibble: 2 × 3
## text_ind n p
## <chr> <int> <dbl>
## 1 no 297 0.99
## 2 yes 3 0.01
## [1] "Margin of Error: 0.114891252930761"
## # A tibble: 14,252 × 4
## # Groups: replicate [14,252]
## replicate text_ind n p_hat
## <int> <chr> <int> <dbl>
## 1 1 yes 2 0.00667
## 2 2 yes 2 0.00667
## 3 3 yes 1 0.00333
## 4 4 yes 7 0.0233
## 5 5 yes 6 0.02
## 6 6 yes 3 0.01
## 7 7 yes 1 0.00333
## 8 8 yes 5 0.0167
## 9 9 yes 4 0.0133
## 10 10 yes 6 0.02
## # … with 14,242 more rows
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 14252 0.01 0.01 0.01 0.01 0 0 0.04 0.04 0.74 0.49 0
## [1] 4500000
me <- tinyData %>%
analyzeMarginOfErrorData## # A tibble: 2 × 3
## text_ind n p
## <chr> <int> <dbl>
## 1 no 240 0.8
## 2 yes 60 0.2
## [1] "Margin of Error: 0.103279555898864"
## # A tibble: 15,000 × 4
## # Groups: replicate [15,000]
## replicate text_ind n p_hat
## <int> <chr> <int> <dbl>
## 1 1 yes 49 0.163
## 2 2 yes 62 0.207
## 3 3 yes 65 0.217
## 4 4 yes 57 0.19
## 5 5 yes 56 0.187
## 6 6 yes 62 0.207
## 7 7 yes 64 0.213
## 8 8 yes 60 0.2
## 9 9 yes 64 0.213
## 10 10 yes 71 0.237
## # … with 14,990 more rows
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 15000 0.2 0.02 0.2 0.2 0.02 0.12 0.3 0.19 0.13 0.05 0
## [1] 4500000
me <- smallData %>%
analyzeMarginOfErrorData## # A tibble: 2 × 3
## text_ind n p
## <chr> <int> <dbl>
## 1 no 180 0.6
## 2 yes 120 0.4
## [1] "Margin of Error: 0.0894427190999916"
## # A tibble: 15,000 × 4
## # Groups: replicate [15,000]
## replicate text_ind n p_hat
## <int> <chr> <int> <dbl>
## 1 1 yes 146 0.487
## 2 2 yes 121 0.403
## 3 3 yes 125 0.417
## 4 4 yes 142 0.473
## 5 5 yes 123 0.41
## 6 6 yes 119 0.397
## 7 7 yes 110 0.367
## 8 8 yes 132 0.44
## 9 9 yes 116 0.387
## 10 10 yes 109 0.363
## # … with 14,990 more rows
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 15000 0.4 0.03 0.4 0.4 0.03 0.3 0.51 0.22 0.06 0 0
## [1] 4500000
me <- mediumData %>%
analyzeMarginOfErrorData## # A tibble: 2 × 3
## text_ind n p
## <chr> <int> <dbl>
## 1 no 120 0.4
## 2 yes 180 0.6
## [1] "Margin of Error: 0.0730296743340221"
## # A tibble: 15,000 × 4
## # Groups: replicate [15,000]
## replicate text_ind n p_hat
## <int> <chr> <int> <dbl>
## 1 1 yes 183 0.61
## 2 2 yes 176 0.587
## 3 3 yes 193 0.643
## 4 4 yes 185 0.617
## 5 5 yes 183 0.61
## 6 6 yes 178 0.593
## 7 7 yes 196 0.653
## 8 8 yes 185 0.617
## 9 9 yes 166 0.553
## 10 10 yes 181 0.603
## # … with 14,990 more rows
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 15000 0.6 0.03 0.6 0.6 0.03 0.49 0.7 0.21 -0.03 0.01 0
## [1] 4500000
me <- largeData %>%
analyzeMarginOfErrorData## # A tibble: 2 × 3
## text_ind n p
## <chr> <int> <dbl>
## 1 no 59 0.197
## 2 yes 240 0.803
## [1] "Margin of Error: 0.0513789013235358"
## # A tibble: 15,000 × 4
## # Groups: replicate [15,000]
## replicate text_ind n p_hat
## <int> <chr> <int> <dbl>
## 1 1 yes 246 0.823
## 2 2 yes 234 0.783
## 3 3 yes 232 0.776
## 4 4 yes 240 0.803
## 5 5 yes 238 0.796
## 6 6 yes 240 0.803
## 7 7 yes 226 0.756
## 8 8 yes 237 0.793
## 9 9 yes 235 0.786
## 10 10 yes 246 0.823
## # … with 14,990 more rows
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 15000 0.8 0.02 0.8 0.8 0.02 0.71 0.89 0.18 -0.08 0.03 0
## [1] 4485000
me <- massiveData %>%
analyzeMarginOfErrorData(binwidth = smallBinwidth)## # A tibble: 2 × 3
## text_ind n p
## <chr> <int> <dbl>
## 1 no 3 0.01
## 2 yes 297 0.99
## [1] "Margin of Error: 0.0115470053837925"
## # A tibble: 15,000 × 4
## # Groups: replicate [15,000]
## replicate text_ind n p_hat
## <int> <chr> <int> <dbl>
## 1 1 yes 295 0.983
## 2 2 yes 296 0.987
## 3 3 yes 296 0.987
## 4 4 yes 296 0.987
## 5 5 yes 296 0.987
## 6 6 yes 297 0.99
## 7 7 yes 299 0.997
## 8 8 yes 297 0.99
## 9 9 yes 298 0.993
## 10 10 yes 297 0.99
## # … with 14,990 more rows
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 15000 0.99 0.01 0.99 0.99 0 0.96 1 0.04 -0.56 0.24 0
## [1] 4500000
The margin of error decreases as n increases
tinyNumOfElements = 10
smallNumOfElements = 50
mediumNumOfElements = 100
largeNumOfElements = 1000
massiveNumOfElements = 10000
largeBinwidth = .1
mediumBinwidth = .02
tinyData = tibble(
text_ind = c(rep(yesLabel, tinyNumOfElements * proportion), rep(noLabel, tinyNumOfElements * (1 - proportion)))
)
smallData = tibble(
text_ind = c(rep(yesLabel, smallNumOfElements * proportion), rep(noLabel, smallNumOfElements * (1 - proportion)))
)
mediumData = tibble(
text_ind = c(rep(yesLabel, mediumNumOfElements * proportion), rep(noLabel, mediumNumOfElements * (1 - proportion)))
)
largeData = tibble(
text_ind = c(rep(yesLabel, largeNumOfElements * proportion), rep(noLabel, largeNumOfElements * (1 - proportion)))
)
massiveData = tibble(
text_ind = c(rep(yesLabel, massiveNumOfElements * proportion), rep(noLabel, massiveNumOfElements * (1 - proportion)))
)
me <- tinyData %>%
analyzeMarginOfErrorData(binwidth = largeBinwidth)## # A tibble: 2 × 3
## text_ind n p
## <chr> <int> <dbl>
## 1 no 9 0.9
## 2 yes 1 0.1
## [1] "Margin of Error: 0.6"
## # A tibble: 9,871 × 4
## # Groups: replicate [9,871]
## replicate text_ind n p_hat
## <int> <chr> <int> <dbl>
## 1 1 yes 1 0.1
## 2 2 yes 2 0.2
## 3 3 yes 2 0.2
## 4 4 yes 1 0.1
## 5 6 yes 1 0.1
## 6 7 yes 1 0.1
## 7 8 yes 3 0.3
## 8 9 yes 3 0.3
## 9 11 yes 2 0.2
## 10 13 yes 1 0.1
## # … with 9,861 more rows
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 9871 0.15 0.07 0.1 0.14 0 0.1 0.6 0.5 1.37 1.66 0
## [1] 150000
me <- smallData %>%
analyzeMarginOfErrorData(binwidth = mediumBinwidth)## # A tibble: 2 × 3
## text_ind n p
## <chr> <int> <dbl>
## 1 no 45 0.9
## 2 yes 5 0.1
## [1] "Margin of Error: 0.268328157299975"
## # A tibble: 14,918 × 4
## # Groups: replicate [14,918]
## replicate text_ind n p_hat
## <int> <chr> <int> <dbl>
## 1 1 yes 2 0.04
## 2 2 yes 6 0.12
## 3 3 yes 4 0.08
## 4 4 yes 3 0.06
## 5 5 yes 9 0.18
## 6 6 yes 5 0.1
## 7 7 yes 7 0.14
## 8 8 yes 8 0.16
## 9 9 yes 2 0.04
## 10 10 yes 7 0.14
## # … with 14,908 more rows
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 14918 0.1 0.04 0.1 0.1 0.03 0.02 0.36 0.34 0.42 0.12 0
## [1] 750000
me <- mediumData %>%
analyzeMarginOfErrorData## # A tibble: 2 × 3
## text_ind n p
## <chr> <int> <dbl>
## 1 no 90 0.9
## 2 yes 10 0.1
## [1] "Margin of Error: 0.189736659610103"
## # A tibble: 15,000 × 4
## # Groups: replicate [15,000]
## replicate text_ind n p_hat
## <int> <chr> <int> <dbl>
## 1 1 yes 14 0.14
## 2 2 yes 15 0.15
## 3 3 yes 11 0.11
## 4 4 yes 6 0.06
## 5 5 yes 6 0.06
## 6 6 yes 9 0.09
## 7 7 yes 16 0.16
## 8 8 yes 5 0.05
## 9 9 yes 9 0.09
## 10 10 yes 11 0.11
## # … with 14,990 more rows
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 15000 0.1 0.03 0.1 0.1 0.03 0.01 0.24 0.23 0.33 0.14 0
## [1] 1500000
me <- largeData %>%
analyzeMarginOfErrorData(binwidth = smallBinwidth)## # A tibble: 2 × 3
## text_ind n p
## <chr> <int> <dbl>
## 1 no 900 0.9
## 2 yes 100 0.1
## [1] "Margin of Error: 0.06"
## # A tibble: 15,000 × 4
## # Groups: replicate [15,000]
## replicate text_ind n p_hat
## <int> <chr> <int> <dbl>
## 1 1 yes 98 0.098
## 2 2 yes 110 0.11
## 3 3 yes 102 0.102
## 4 4 yes 83 0.083
## 5 5 yes 81 0.081
## 6 6 yes 92 0.092
## 7 7 yes 84 0.084
## 8 8 yes 92 0.092
## 9 9 yes 111 0.111
## 10 10 yes 108 0.108
## # … with 14,990 more rows
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 15000 0.1 0.01 0.1 0.1 0.01 0.07 0.14 0.07 0.06 0.01 0
## [1] 15000000
me <- massiveData %>%
analyzeMarginOfErrorData(binwidth = smallBinwidth)## # A tibble: 2 × 3
## text_ind n p
## <chr> <int> <dbl>
## 1 no 9000 0.9
## 2 yes 1000 0.1
## [1] "Margin of Error: 0.0189736659610103"
## # A tibble: 15,000 × 4
## # Groups: replicate [15,000]
## replicate text_ind n p_hat
## <int> <chr> <int> <dbl>
## 1 1 yes 1033 0.103
## 2 2 yes 1043 0.104
## 3 3 yes 1015 0.102
## 4 4 yes 1019 0.102
## 5 5 yes 1043 0.104
## 6 6 yes 1008 0.101
## 7 7 yes 998 0.0998
## 8 8 yes 949 0.0949
## 9 9 yes 1043 0.104
## 10 10 yes 1012 0.101
## # … with 14,990 more rows
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 15000 0.1 0 0.1 0.1 0 0.09 0.11 0.02 0.05 0.02 0
## [1] 150000000
Of those that sleep more than 10 hours, we can say with 95% confidence that about 27% of them work out every day. This is a significant relationship given that the only other prevalent pattern observed is 0 days at 31%.
Ho: Those who sleep 10+ hours per day are NOT more likely to strength train every day of the week Ha: Those who sleep 10+ hours per day are more likely to strength train every day of the week
type10Plus <- "10+"
binwidth = .0003
#exerciseEveryday <- yrbss %>%
# mutate(text_ind = ifelse(!is.na(strength_training_7d)
# & strength_training_7d == type7, yesLabel, noLabel))
#
#exerciseEverydayMarginOfError <- exerciseEveryday %>%
# analyzeMarginOfErrorData(binwidth = binwidth)
#
#exerciseEverydayConfidenceInerval <- exerciseEveryday %>%
# analyzeConfidenceInterval
#sleep10OrMoreHours <- yrbss %>%
# mutate(text_ind = ifelse(!is.na(school_night_hours_sleep)
# & school_night_hours_sleep == type10Plus, yesLabel, noLabel))
#
#sleep10OrMoreHoursMarginOfError <- sleep10OrMoreHours %>%
# analyzeMarginOfErrorData(binwidth = binwidth)
#
#sleep10OrMoreHoursConfidenceInerval <- sleep10OrMoreHours %>%
# analyzeConfidenceInterval
exerciseEverydayAndSleep10OrMoreHours <- yrbss %>%
filter(strength_training_7d == type7) %>%
mutate(text_ind = ifelse(!is.na(school_night_hours_sleep)
& school_night_hours_sleep == type10Plus, yesLabel, noLabel))
exerciseEverydayAndSleep10OrMoreHoursMarginOfError <- exerciseEverydayAndSleep10OrMoreHours %>%
analyzeMarginOfErrorData(binwidth = smallBinwidth)## # A tibble: 2 × 3
## text_ind n p
## <chr> <int> <dbl>
## 1 no 2001 0.960
## 2 yes 84 0.0403
## [1] "Margin of Error: 0.0429089098251224"
## # A tibble: 15,000 × 4
## # Groups: replicate [15,000]
## replicate text_ind n p_hat
## <int> <chr> <int> <dbl>
## 1 1 yes 91 0.0436
## 2 2 yes 103 0.0494
## 3 3 yes 106 0.0508
## 4 4 yes 98 0.0470
## 5 5 yes 84 0.0403
## 6 6 yes 89 0.0427
## 7 7 yes 77 0.0369
## 8 8 yes 88 0.0422
## 9 9 yes 90 0.0432
## 10 10 yes 80 0.0384
## # … with 14,990 more rows
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 15000 0.04 0 0.04 0.04 0 0.03 0.06 0.03 0.09 0.01 0
## [1] 31275000
exerciseEverydayAndSleep10OrMoreHoursConfidenceInerval <- exerciseEverydayAndSleep10OrMoreHours %>%
analyzeConfidenceInterval## # A tibble: 1 × 2
## lower_ci upper_ci
## <dbl> <dbl>
## 1 0.0317 0.0484
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 2 0.04 0.01 0.04 0.04 0.01 0.03 0.05 0.02 0 -2.75 0.01
sleep10OrMoreHoursAndExerciseEveryday <- yrbss %>%
filter(school_night_hours_sleep == type10Plus) %>%
mutate(text_ind = ifelse(!is.na(strength_training_7d)
& strength_training_7d == type7, yesLabel, noLabel))
sleep10OrMoreHoursAndExerciseEverydayMarginOfError <- sleep10OrMoreHoursAndExerciseEveryday %>%
analyzeMarginOfErrorData## # A tibble: 2 × 3
## text_ind n p
## <chr> <int> <dbl>
## 1 no 232 0.734
## 2 yes 84 0.266
## [1] "Margin of Error: 0.0964021912134672"
## # A tibble: 15,000 × 4
## # Groups: replicate [15,000]
## replicate text_ind n p_hat
## <int> <chr> <int> <dbl>
## 1 1 yes 90 0.285
## 2 2 yes 84 0.266
## 3 3 yes 76 0.241
## 4 4 yes 91 0.288
## 5 5 yes 86 0.272
## 6 6 yes 95 0.301
## 7 7 yes 79 0.25
## 8 8 yes 83 0.263
## 9 9 yes 83 0.263
## 10 10 yes 71 0.225
## # … with 14,990 more rows
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 15000 0.27 0.02 0.27 0.27 0.02 0.17 0.37 0.2 0.05 0 0
## [1] 4740000
sleep10OrMoreHoursAndExerciseEverydayConfidenceInerval <- sleep10OrMoreHoursAndExerciseEveryday %>%
analyzeConfidenceInterval## # A tibble: 1 × 2
## lower_ci upper_ci
## <dbl> <dbl>
## 1 0.215 0.313
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 2 0.26 0.07 0.26 0.26 0.07 0.22 0.31 0.1 0 -2.75 0.05
sleep10OrMoreHoursAndExerciseEveryday %>%
select(strength_training_7d) %>%
count(strength_training_7d) %>%
mutate(p = n / sum(n)) %>%
select(strength_training_7d, n, p)## # A tibble: 9 × 3
## strength_training_7d n p
## <int> <int> <dbl>
## 1 0 100 0.316
## 2 1 17 0.0538
## 3 2 31 0.0981
## 4 3 31 0.0981
## 5 4 18 0.0570
## 6 5 23 0.0728
## 7 6 8 0.0253
## 8 7 84 0.266
## 9 NA 4 0.0127
The null hypothesis is rejected when the p value is less than or equal to the significance level, which in this case is .05, or 5%. It represents how often we might mistakenly accept the null hypothesis in this scenario
Using a proportion of 50% to get the highest possible sample size and a margin of error of 1%, our ideal sample size would be 9604
((1.96 ^ 2) * (0.5 * 0.5)) / 0.01 ^ 2 ## [1] 9604